library(dplyr)
library(readr)
library(broom)
library(ggplot2)
library(tidymodels)
library(readxl)
library(lubridate)
library(ISLR)
library(vip)
library(cluster)
library(rpart.plot) # install.packages('rpart.plot')
tidymodels_prefer() # Resolves conflicts, prefers tidymodel functions
# Melbourne Housing Data
# Read the raw spreadsheet, drop identifier/geolocation columns that are not
# used downstream, parse the sale date, and add a log-transformed price outcome.
melb <- read_excel("melb_data.xlsx")
melbclean <- melb %>% select(-Longtitude, -Lattitude, -Address, -Postcode, -SellerG) %>%
mutate(Date = dmy(Date), logPrice = log(Price))
# Missing-value count per column: BuildingArea, YearBuilt, CouncilArea carry
# most of the missingness
sapply(melbclean, function(x) sum(is.na(x)))
## Suburb Rooms Type Price Method
## 0 0 0 0 0
## Date Distance Bedroom2 Bathroom Car
## 0 0 0 0 62
## Landsize BuildingArea YearBuilt CouncilArea Regionname
## 0 6450 5375 1369 0
## Propertycount logPrice
## 0 0
# Complete-case analysis: drop every row containing any NA
melbclean <- melbclean %>% na.omit()
head(melbclean)
## # A tibble: 6 × 17
## Suburb Rooms Type Price Method Date Distance Bedroom2 Bathroom Car
## <chr> <dbl> <chr> <dbl> <chr> <date> <dbl> <dbl> <dbl> <dbl>
## 1 Abbotsf… 2 h 1.03e6 S 2016-02-04 2.5 2 1 0
## 2 Abbotsf… 3 h 1.46e6 SP 2017-03-04 2.5 3 2 0
## 3 Abbotsf… 4 h 1.6 e6 VB 2016-06-04 2.5 3 1 2
## 4 Abbotsf… 3 h 1.88e6 S 2016-05-07 2.5 4 2 0
## 5 Abbotsf… 2 h 1.64e6 S 2016-10-08 2.5 2 1 2
## 6 Abbotsf… 2 h 1.10e6 S 2016-10-08 2.5 3 1 2
## # … with 7 more variables: Landsize <dbl>, BuildingArea <dbl>, YearBuilt <dbl>,
## # CouncilArea <chr>, Regionname <chr>, Propertycount <dbl>, logPrice <dbl>
# Exploratory plots
# Histogram of sale prices; the distribution is right-skewed
ggplot(melbclean, aes(x = Price)) +
  geom_histogram(fill = 'lightblue') +
  labs(x = "Price of houses in Melbourne (Australian dollars)", y = 'Count') +
  theme_classic()
The distribution of house prices in Melbourne is right-skewed. The majority of the prices lie between 0 and 1,500,000 Australian dollars.
It is also interesting to see how the distance from the Melbourne Central Business District affects the price.
# Scatterplot of Price vs. Distance
# Label fix: these are Melbourne sales, so Distance is measured from the
# Melbourne (not Sydney) Central Business District.
ggplot(melbclean, aes(x = Distance, y = Price)) +
  geom_point(color = 'pink') +
  geom_smooth() +
  theme_classic() +
  labs(x = "Distance from Melbourne Central Business District (km)", y = "Price of houses in Melbourne (Australian dollars)")
# Remove extreme Price outliers (sales above 6 million AUD)
melbclean <- melbclean %>% filter(Price < 6000000)
After removing the Price outliers:
# Scatterplot of Price vs. Distance, re-drawn after outlier removal
# Label fix: the CBD in this data set is Melbourne's, not Sydney's.
ggplot(melbclean, aes(x = Distance, y = Price)) +
  geom_point(color = 'pink') +
  geom_smooth() +
  theme_classic() +
  labs(x = "Distance from Melbourne Central Business District (km)", y = "Price of houses in Melbourne (Australian dollars)")
# Boxplot of YearBuilt by house Type
# geom_smooth() removed: on a boxplot with a discrete x axis it draws nothing
# and only emits a warning.
ggplot(melbclean, aes(x = Type, y = YearBuilt)) +
  geom_boxplot(color = 'blue') +
  theme_classic() +
  labs(x = "Type of Houses", y = "The Year the House is built")
# Drop the implausible YearBuilt outlier(s): houses "built" before 1800
melbclean <- melbclean %>% filter(YearBuilt > 1800)
After removing the YearBuilt outlier:
# Boxplot of YearBuilt by house Type, re-drawn after outlier removal
# geom_smooth() removed: it is a no-op on a discrete x axis.
ggplot(melbclean, aes(x = Type, y = YearBuilt)) +
  geom_boxplot(color = 'blue') +
  theme_classic() +
  labs(x = "Type of Houses", y = "The Year the House is built")
# 10-fold cross-validation splits
# NOTE(review): no set.seed() precedes vfold_cv(), so the folds are not
# reproducible across runs — consider seeding here.
melbclean_cv <- vfold_cv(melbclean, v = 10)
# Ordinary least squares specification
lm_spec <-
linear_reg() %>%
set_engine(engine = 'lm') %>%
set_mode('regression')
# Recipe: predict logPrice; remove raw Price (the outcome's source) and
# high-cardinality/identifier-like columns, then standardize and dummy-encode
full_rec <- recipe(logPrice ~ ., data = melbclean) %>%
step_rm(CouncilArea, Price, Regionname, Method, Suburb, Date) %>%
step_nzv(all_predictors()) %>% # removes variables with the same value
step_normalize(all_numeric_predictors()) %>% # important standardization step for LASSO
step_dummy(all_nominal_predictors())
melb_wf <- workflow() %>%
add_recipe(full_rec) %>%
add_model(lm_spec)
# Fit once on the full data (for residual analysis later) and evaluate by CV
mod_initial <- fit(melb_wf, data = melbclean)
mod1_cv <- fit_resamples(melb_wf,
resamples = melbclean_cv,
metrics = metric_set(rmse, rsq, mae)
)
mod1_cv %>% collect_metrics()
## # A tibble: 3 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <fct>
## 1 mae standard 0.254 10 0.00287 Preprocessor1_Model1
## 2 rmse standard 0.326 10 0.00623 Preprocessor1_Model1
## 3 rsq standard 0.640 10 0.0106 Preprocessor1_Model1
set.seed(244)
# LASSO specification: mixture = 1 is a pure L1 penalty; penalty is tuned
lm_lasso_spec_tune <-
linear_reg() %>%
set_args(mixture = 1, penalty = tune()) %>%
set_engine(engine = 'glmnet') %>%
set_mode('regression')
lasso_wf <- workflow() %>%
add_recipe(full_rec) %>%
add_model(lm_lasso_spec_tune)
# 30 penalty values, log10-spaced from 1e-5 to 10
penalty_grid <- grid_regular(
penalty(range = c(-5, 1)),
levels = 30)
lasso_fit_cv <- tune_grid( # new function for tuning hyperparameters
lasso_wf, # workflow
resamples = melbclean_cv, # folds
metrics = metric_set(rmse, mae, rsq),
grid = penalty_grid # penalty grid
)
# Best penalty by RMSE and by R-squared — both pick the smallest grid value,
# i.e. almost no regularization improves raw CV error
lasso_fit_cv %>% select_best(metric = 'rmse') %>% arrange(desc(penalty))
## # A tibble: 1 × 2
## penalty .config
## <dbl> <fct>
## 1 0.00001 Preprocessor1_Model01
lasso_fit_cv %>% select_best(metric = 'rsq') %>% arrange(desc(penalty))
## # A tibble: 1 × 2
## penalty .config
## <dbl> <fct>
## 1 0.00001 Preprocessor1_Model01
# One-standard-error rule: choose the largest (most regularized) penalty whose
# CV metric is within one SE of the best
lasso_fit_cv %>%
  select_by_one_std_err(metric = 'rmse', desc(penalty))
## # A tibble: 1 × 9
## penalty .metric .estimator mean n std_err .config .best .bound
## <dbl> <chr> <chr> <dbl> <int> <dbl> <fct> <dbl> <dbl>
## 1 0.0204 rmse standard 0.330 10 0.00576 Preprocessor1_Mod… 0.326 0.333
lasso_fit_cv %>%
  select_by_one_std_err(metric = 'rsq', desc(penalty))
## # A tibble: 1 × 9
## penalty .metric .estimator mean n std_err .config .best .bound
## <dbl> <chr> <chr> <dbl> <int> <dbl> <fct> <dbl> <dbl>
## 1 0.0329 rsq standard 0.633 10 0.0103 Preprocessor1_Mod… 0.640 0.630
best_penalty <- lasso_fit_cv %>%
  select_by_one_std_err(metric = 'rmse', desc(penalty))
# Plug the selected penalty into the workflow
tuned_lasso_wf <- finalize_workflow(lasso_wf, best_penalty)
# pull_workflow_spec() is deprecated in the workflows package; its drop-in
# replacement is extract_spec_parsnip()
lm_lasso_spec <- tuned_lasso_wf %>% extract_spec_parsnip() # Save final tuned model spec
# CV metric curves across the penalty grid
lasso_fit_cv %>% autoplot() + theme_classic()
# Refit the finalized LASSO workflow on the full data set
lasso_fit <- tuned_lasso_wf %>%
fit(data = melbclean)
# Coefficients at the chosen penalty; terms shrunk exactly to 0 were dropped
# by the L1 penalty (Bedroom2, Landsize, Propertycount, Type_t)
tidy(lasso_fit) %>% arrange(desc(abs(estimate)))
## # A tibble: 12 × 3
## term estimate penalty
## <chr> <dbl> <dbl>
## 1 (Intercept) 13.8 0.0204
## 2 Type_u -0.260 0.0204
## 3 Rooms 0.134 0.0204
## 4 YearBuilt -0.129 0.0204
## 5 Distance -0.125 0.0204
## 6 Bathroom 0.110 0.0204
## 7 BuildingArea 0.102 0.0204
## 8 Car 0.0190 0.0204
## 9 Bedroom2 0 0.0204
## 10 Landsize 0 0.0204
## 11 Propertycount 0 0.0204
## 12 Type_t 0 0.0204
# Coefficient paths: extract the underlying glmnet fit and reshape the
# coefficient matrix (one row per lambda) into long format for plotting
glmnet_output <- lasso_fit %>% extract_fit_parsnip() %>% pluck('fit') # way to get the original glmnet output
lambdas <- glmnet_output$lambda
coefs_lambdas <-
coefficients(glmnet_output, s = lambdas ) %>%
as.matrix() %>%
t() %>%
as.data.frame() %>%
mutate(lambda = lambdas ) %>%
select(lambda, everything(), -`(Intercept)`) %>%
pivot_longer(cols = -lambda,
names_to = "term",
values_to = "coef") %>%
# Collapse dummy columns (e.g. Type_u, Type_t) back to their source variable
# name for a cleaner color legend
mutate(var = map_chr(stringr::str_split(term,"_"),~.[1]))
# One line per term; dashed vertical line marks the selected penalty
coefs_lambdas %>%
ggplot(aes(x = lambda, y = coef, group = term, color = var)) +
geom_line() +
geom_vline(xintercept = best_penalty %>% pull(penalty), linetype = 'dashed') +
theme_classic() +
theme(legend.position = "bottom", legend.text=element_text(size=8))
glmnet_output_1 <- lasso_fit %>% extract_fit_engine()
# Create a boolean matrix (predictors x lambdas) of variable exclusion
bool_predictor_exclude <- glmnet_output_1$beta==0
# For each predictor, importance = number of lambda values (counted from the
# largest lambda downward) at which its coefficient is still in the model;
# 0 if it is excluded along the entire path.
# vapply() replaces sapply(): it pins the result to numeric(1) per element, so
# the return type cannot silently change on edge-case input.
var_imp <- vapply(seq_len(nrow(bool_predictor_exclude)), function(row) {
  this_coeff_path <- bool_predictor_exclude[row, ]
  if (sum(this_coeff_path) == ncol(bool_predictor_exclude)) {
    0
  } else {
    ncol(bool_predictor_exclude) - which.min(this_coeff_path) + 1
  }
}, numeric(1))
# Create a dataset of this information and sort
var_imp_data <- tibble(
var_name = rownames(bool_predictor_exclude),
var_imp = var_imp
)
var_imp_data %>% arrange(desc(var_imp))
## # A tibble: 11 × 2
## var_name var_imp
## <chr> <dbl>
## 1 Rooms 61
## 2 BuildingArea 58
## 3 Type_u 58
## 4 Bathroom 55
## 5 YearBuilt 55
## 6 Distance 51
## 7 Car 39
## 8 Type_t 30
## 9 Landsize 28
## 10 Bedroom2 26
## 11 Propertycount 25
# Residual analysis for the baseline linear model
mod1_output <- mod_initial%>%
predict(new_data = melbclean) %>% #this function maintains the row order of the new_data
bind_cols(melbclean) %>%
mutate(resid = logPrice - .pred)
# Inspect the single observation with a very large negative residual
mod1_output %>% filter(resid < -2.5)
## # A tibble: 1 × 19
## .pred Suburb Rooms Type Price Method Date Distance Bedroom2 Bathroom
## <dbl> <chr> <dbl> <chr> <dbl> <chr> <date> <dbl> <dbl> <dbl>
## 1 18.0 Camberw… 5 h 2.61e6 S 2016-10-15 7.8 5 2
## # … with 9 more variables: Car <dbl>, Landsize <dbl>, BuildingArea <dbl>,
## # YearBuilt <dbl>, CouncilArea <chr>, Regionname <chr>, Propertycount <dbl>,
## # logPrice <dbl>, resid <dbl>
# Indicator flagging the extreme prediction found above (.pred = 18.0);
# the 17.8 cutoff isolates exactly that observation
mod1_output <- mod1_output %>%
mutate(colors = if_else(.pred > 17.8,1,0))
# Quick plot: Residuals vs. Fitted values
ggplot(mod1_output, aes(x = .pred, y = resid)) +
geom_point() +
geom_smooth() +
geom_hline(yintercept = 0, color = "red") +
theme_classic() +
labs(x = "Fitted values", y = "Residuals")
# Residuals vs. Distance predictor
ggplot(mod1_output, aes(x= Distance, y = resid)) +
geom_point() +
geom_smooth() +
geom_hline(yintercept = 0, color = "red") +
theme_classic() +
labs(x = "Distance", y = "Residuals")
# Residuals vs. YearBuilt predictor
ggplot(mod1_output, aes(x= YearBuilt, y = resid)) +
geom_point() +
geom_smooth() +
geom_hline(yintercept = 0, color = "red") +
theme_classic() +
labs(x = "Year Built", y = "Residuals")
mod1_output %>% filter(.pred > 17)
## # A tibble: 1 × 20
## .pred Suburb Rooms Type Price Method Date Distance Bedroom2 Bathroom
## <dbl> <chr> <dbl> <chr> <dbl> <chr> <date> <dbl> <dbl> <dbl>
## 1 18.0 Camberw… 5 h 2.61e6 S 2016-10-15 7.8 5 2
## # … with 10 more variables: Car <dbl>, Landsize <dbl>, BuildingArea <dbl>,
## # YearBuilt <dbl>, CouncilArea <chr>, Regionname <chr>, Propertycount <dbl>,
## # logPrice <dbl>, resid <dbl>, colors <dbl>
set.seed(123)
# Same recipe as before, plus natural splines (5 df) on Distance and Bathroom
full_rec <- recipe(logPrice ~ ., data = melbclean) %>%
step_rm(CouncilArea,Price,Regionname,Method,Suburb,Date) %>%
step_nzv(all_predictors()) %>% # removes variables with the same value
step_normalize(all_numeric_predictors()) %>% # important standardization step for LASSO
step_dummy(all_nominal_predictors()) %>%
step_ns(Distance, deg_free = 5) %>%
step_ns(Bathroom, deg_free = 5)
# Workflow (Recipe + Model)
spline_wf <- workflow() %>%
add_recipe(full_rec) %>%
add_model(lm_spec)
# CV to Evaluate
cv_output <- fit_resamples(
spline_wf,
resamples = melbclean_cv, # cv folds
metrics = metric_set(mae,rsq,rmse)
)
# Slight improvement over the plain linear model (rmse 0.325 vs 0.326)
cv_output %>% collect_metrics()
## # A tibble: 3 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <fct>
## 1 mae standard 0.253 10 0.00294 Preprocessor1_Model1
## 2 rmse standard 0.325 10 0.00625 Preprocessor1_Model1
## 3 rsq standard 0.645 10 0.00988 Preprocessor1_Model1
# Fit model
ns_mod <- spline_wf %>%
fit(data = melbclean)
# NOTE(review): the NA coefficients on Bathroom_ns_4/5 below indicate a
# rank-deficient fit — Bathroom has few distinct values, so deg_free = 5 is
# likely too high; consider lowering it. TODO confirm.
ns_mod %>%
tidy()
## # A tibble: 20 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 14.1 0.158 89.3 0
## 2 Rooms 0.110 0.0141 7.80 7.08e- 15
## 3 Bedroom2 0.0199 0.0138 1.44 1.50e- 1
## 4 Car 0.0386 0.00473 8.15 4.27e- 16
## 5 Landsize 0.0118 0.00414 2.85 4.35e- 3
## 6 BuildingArea 0.107 0.00539 19.8 1.13e- 84
## 7 YearBuilt -0.135 0.00523 -25.9 1.78e-140
## 8 Propertycount -0.0110 0.00416 -2.64 8.38e- 3
## 9 Type_t -0.0729 0.0160 -4.55 5.51e- 6
## 10 Type_u -0.302 0.0133 -22.7 2.04e-109
## 11 Distance_ns_1 -0.271 0.0344 -7.86 4.55e- 15
## 12 Distance_ns_2 -0.271 0.0407 -6.65 3.21e- 11
## 13 Distance_ns_3 -0.833 0.0415 -20.0 1.23e- 86
## 14 Distance_ns_4 -1.01 0.0909 -11.1 3.61e- 28
## 15 Distance_ns_5 -0.894 0.0751 -11.9 2.84e- 32
## 16 Bathroom_ns_1 0.111 0.640 0.174 8.62e- 1
## 17 Bathroom_ns_2 -0.120 0.0992 -1.21 2.26e- 1
## 18 Bathroom_ns_3 0.614 0.224 2.74 6.15e- 3
## 19 Bathroom_ns_4 NA NA NA NA
## 20 Bathroom_ns_5 NA NA NA NA
# Residual diagnostics for the spline model
spline_mod_output <- melbclean %>%
bind_cols(predict(ns_mod, new_data = melbclean)) %>%
mutate(resid = logPrice - .pred)
ggplot(spline_mod_output, aes(x = logPrice, y = resid)) +
geom_point() +
geom_smooth() +
geom_hline(yintercept = 0, color = "red") +
theme_classic()
# Rank terms by absolute coefficient size (predictors were standardized by the
# recipe, so magnitudes are roughly comparable; NA rows are the rank-deficient
# Bathroom spline terms)
ns_mod %>%
tidy() %>%
mutate(absolute_estimate_spline = abs(estimate)) %>%
arrange(desc(absolute_estimate_spline))
## # A tibble: 20 × 6
## term estimate std.error statistic p.value absolute_estimate_spli…
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 14.1 0.158 89.3 0 14.1
## 2 Distance_ns_4 -1.01 0.0909 -11.1 3.61e- 28 1.01
## 3 Distance_ns_5 -0.894 0.0751 -11.9 2.84e- 32 0.894
## 4 Distance_ns_3 -0.833 0.0415 -20.0 1.23e- 86 0.833
## 5 Bathroom_ns_3 0.614 0.224 2.74 6.15e- 3 0.614
## 6 Type_u -0.302 0.0133 -22.7 2.04e-109 0.302
## 7 Distance_ns_2 -0.271 0.0407 -6.65 3.21e- 11 0.271
## 8 Distance_ns_1 -0.271 0.0344 -7.86 4.55e- 15 0.271
## 9 YearBuilt -0.135 0.00523 -25.9 1.78e-140 0.135
## 10 Bathroom_ns_2 -0.120 0.0992 -1.21 2.26e- 1 0.120
## 11 Bathroom_ns_1 0.111 0.640 0.174 8.62e- 1 0.111
## 12 Rooms 0.110 0.0141 7.80 7.08e- 15 0.110
## 13 BuildingArea 0.107 0.00539 19.8 1.13e- 84 0.107
## 14 Type_t -0.0729 0.0160 -4.55 5.51e- 6 0.0729
## 15 Car 0.0386 0.00473 8.15 4.27e- 16 0.0386
## 16 Bedroom2 0.0199 0.0138 1.44 1.50e- 1 0.0199
## 17 Landsize 0.0118 0.00414 2.85 4.35e- 3 0.0118
## 18 Propertycount -0.0110 0.00416 -2.64 8.38e- 3 0.0110
## 19 Bathroom_ns_4 NA NA NA NA NA
## 20 Bathroom_ns_5 NA NA NA NA NA
set.seed(123)
# Generalized Additive Regression (GAM) Model
# NOTE(review): the GAM is fit on raw Price, while the earlier models used
# logPrice — the fits are not directly comparable across outcomes; confirm
# whether this is intentional.
gam_spec <-
gen_additive_mod() %>%
set_engine(engine = 'mgcv') %>%
set_mode('regression')
fit_gam_model <- gam_spec %>% # can't use a recipe with gam (yet)
fit(Price ~ Type + s(Distance, k = 5) + s(Rooms, k = 5) + s(YearBuilt, k = 5) + s(Bathroom, k = 5), data = melbclean) # s() stands for splines
fit_gam_model %>% pluck('fit') %>% summary() # estimates generalized CV error to choose smoothness, if not specified
##
## Family: gaussian
## Link function: identity
##
## Formula:
## Price ~ Type + s(Distance, k = 5) + s(Rooms, k = 5) + s(YearBuilt,
## k = 5) + s(Bathroom, k = 5)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1131467 7961 142.120 < 2e-16 ***
## Typet -166749 22052 -7.562 4.56e-14 ***
## Typeu -212177 18970 -11.185 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(Distance) 3.559 3.884 146.4 <2e-16 ***
## s(Rooms) 3.769 3.961 131.5 <2e-16 ***
## s(YearBuilt) 3.606 3.898 120.5 <2e-16 ***
## s(Bathroom) 3.772 3.959 236.1 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.545 Deviance explained = 54.6%
## GCV = 1.9284e+11 Scale est. = 1.9229e+11 n = 6190
Based on the output, Type t (townhouse) and Type u (unit, duplex) have on average a price that is lower by about 167K and 212K respectively compared to Type h (house, cottage, villa, semi, terrace), keeping all other factors fixed.
Based on the smooth terms, Distance, Rooms, YearBuilt, Bathroom seem to be strong predictors after considering other variables in the model. They seem to be wiggly curves with edf ranging from 3 to 4.
# Diagnostics: Check to see if the number of knots is large enough (if p-value is low, increase number of knots)
# 2x2 panel layout for gam.check()'s four residual plots
par(mfrow=c(2,2))
fit_gam_model %>% pluck('fit') %>% mgcv::gam.check()
##
## Method: GCV Optimizer: magic
## Smoothing parameter selection converged after 12 iterations.
## The RMS GCV score gradient at convergence was 144146.1 .
## The Hessian was positive definite.
## Model rank = 19 / 19
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(Distance) 4.00 3.56 0.82 <2e-16 ***
## s(Rooms) 4.00 3.77 0.99 0.15
## s(YearBuilt) 4.00 3.61 1.01 0.67
## s(Bathroom) 4.00 3.77 0.96 0.01 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The residual plots look fine. There are no extreme patterns in the Q-Q plot (they are close to the line, which suggests the residuals are approximately Normal). There is no obvious pattern in the residual vs. linear pred. plot. The histogram of the residuals is unimodal and right-skewed. Lastly, the response v. fitted values are positively correlated as they should be.
For the number of knots, we see the edf (effective degree of freedom) is much lower than the k’ = k - 1 and all of the p-values are quite large. This indicates that there are enough knots for the wiggliness of the relationships.
fit_gam_model %>% pluck('fit') %>% plot(pages = 1)
The relationships between Melbourne house price and the predictors look to be at least slightly nonlinear for the chosen predictors. This suggests that forcing linear relationships would not have been ideal.
# Exploratory plots
ggplot(melbclean, aes(x = BuildingArea, y = logPrice, color = Type)) +
geom_point() +
theme_classic() +
labs(color = 'Type of Houses')
# Remove outliers: Building Area that are greater than 1500
melbclean_classification <- melbclean %>% filter(BuildingArea < 1500)
# Change Type to factor
melbclean_classification <- melbclean_classification %>%
mutate(Type = factor(Type)) %>% #make sure outcome is factor
mutate(across(where(is.character), as.factor)) #change all character variables to factors
# Outcome levels: h (house), t (townhouse), u (unit/duplex)
unique(melbclean_classification$Type)
## [1] h u t
## Levels: h t u
# Exploratory plots
# Scatter of building area vs. log price, colored by house type (post-filter)
ggplot(melbclean_classification, aes(x = BuildingArea, y = logPrice, color = Type)) +
  geom_point() +
  labs(color = 'Type of Houses') +
  theme_classic()
# Boxplot of log price by house type.
# Fix: fill = Type is now mapped, so scale_fill_viridis_d() actually applies
# (it was previously a no-op because no fill aesthetic existed).
ggplot(melbclean_classification, aes(x = Type, y = logPrice, fill = Type)) +
  geom_boxplot() +
  labs(x = "House Type", y = "Log of House Price", fill = "House Type") +
  scale_fill_viridis_d() +
  theme_classic() +
  theme(text = element_text(size = 20))
# Boxplot of distance by house type.
# Fixes: the original had two labs() calls (the second silently overrode the
# first), labeled the CBD as Sydney's (the data are Melbourne sales), and
# carried a fill scale with no fill aesthetic mapped.
ggplot(melbclean_classification, aes(x = Type, y = Distance, fill = Type)) +
  geom_boxplot() +
  scale_fill_viridis_d() +
  labs(x = 'House Type', y = 'Distance from Melbourne Central Business District (km)', fill = 'House Type') +
  theme_classic() +
  theme(text = element_text(size = 20))
# Boxplot of land size by house type.
# Fix: fill = Type mapped so scale_fill_viridis_d() is not a no-op.
ggplot(melbclean_classification, aes(x = Type, y = Landsize, fill = Type)) +
  geom_boxplot() +
  labs(x = "House Type", y = "Landsize", fill = "House Type") +
  scale_fill_viridis_d() +
  theme_classic() +
  theme(text = element_text(size = 20))
# Class balance: number of listings per house type
melbclean_classification %>% count(Type, name = "count")
## # A tibble: 3 × 2
## Type count
## <fct> <int>
## 1 h 4081
## 2 t 602
## 3 u 1505
# Classification tree specification; NULL args fall back to rpart defaults
ct_spec <- decision_tree() %>%
set_engine(engine = 'rpart') %>%
set_args(cost_complexity = NULL, #default is 0.01 (used for pruning a tree)
min_n = NULL, #min number of observations to try split: default is 20 [I think the documentation has a typo and says 2] (used to stop early)
tree_depth = NULL) %>% #max depth (max number of splits to get to any final group: default is 30 (used to stop early)
set_mode('classification') # change this for regression tree
# Recipe: predict Type from everything else; guard against novel factor
# levels, then dummy-encode nominal predictors
type_predict <- recipe(Type ~ ., data = melbclean_classification) %>%
#step_unknown(all_nominal_predictors()) %>%
step_novel(all_nominal_predictors()) %>%
step_dummy(all_nominal_predictors())
type_wf <- workflow() %>%
add_model(ct_spec) %>%
add_recipe(type_predict)
fit_type <- type_wf %>%
fit(data = melbclean_classification)
# Default tuning parameters (min_n = 20, depth = 30, cost_complexity = 0.01)
fit_type %>%
extract_fit_engine() %>%
rpart.plot()
# In-sample (training) predictions joined with the true class
type_output <- bind_rows(
predict(fit_type, new_data = melbclean_classification) %>% bind_cols(melbclean_classification %>% select(Type)) %>% mutate(model = 'orig'))
# Metric bundle; yardstick::spec disambiguates from other spec() functions
ct_metrics <- metric_set(sens, yardstick::spec, accuracy)
# No Information Rate (accuracy if you classified everyone using largest outcome group): Proportion of largest outcome group
melbclean_classification %>%
count(Type) %>%
mutate(prop = n/sum(n)) %>%
filter(prop == max(prop))
## # A tibble: 1 × 3
## Type n prop
## <fct> <int> <dbl>
## 1 h 4081 0.660
# Training Data Metrics
metrics <- type_output %>%
group_by(model) %>%
ct_metrics(estimate = .pred_class, truth = Type)
# Training accuracy 0.848 beats the 0.660 no-information rate
metrics %>% filter(.metric == 'accuracy') %>% arrange(desc(.estimate))
## # A tibble: 1 × 4
## model .metric .estimator .estimate
## <chr> <chr> <chr> <dbl>
## 1 orig accuracy multiclass 0.848
metrics %>% filter(.metric == 'sens') %>% arrange(desc(.estimate))
## # A tibble: 1 × 4
## model .metric .estimator .estimate
## <chr> <chr> <chr> <dbl>
## 1 orig sens macro 0.729
metrics %>% filter(.metric == 'spec') %>% arrange(desc(.estimate))
## # A tibble: 1 × 4
## model .metric .estimator .estimate
## <chr> <chr> <chr> <dbl>
## 1 orig spec macro 0.906
metrics %>%
ggplot(aes(x = .metric, y = .estimate, color = model)) +
geom_point(size = 2) +
geom_line(aes(group = model)) +
theme_classic()
set.seed(123)
# Random forest: 1000 trees; probability = FALSE so OOB predictions are hard
# class labels; impurity-based variable importance
rf_spec <- rand_forest() %>%
set_engine(engine = 'ranger') %>%
set_args(mtry = NULL,
trees = 1000,
min_n = 2,
probability = FALSE,
importance = 'impurity') %>%
set_mode('classification')
# Recipe
# NOTE(review): only Price is removed — logPrice remains a predictor of Type;
# confirm that predicting house type from its (log) price is intended.
data_rec <- recipe(Type ~ ., data = melbclean_classification) %>% step_rm(Price)
# Workflows
data_wf <- workflow() %>%
add_model(rf_spec ) %>%
add_recipe(data_rec)
data_fit <- fit(data_wf, data = melbclean_classification)
# Assemble a tibble of out-of-bag class predictions from a fitted ranger
# workflow, paired with the true classes and a model label.
rf_OOB_output <- function(fit_model, model_label, truth){
  oob_preds <- fit_model %>% extract_fit_engine() %>% pluck('predictions')
  tibble(
    .pred_class = oob_preds, # OOB predictions from the underlying ranger fit
    class = truth,
    model = model_label
  )
}
# OOB predictions for the forest
# NOTE(review): the model label here is the number 2, while the tree output
# used the string 'orig' — a consistent character label would be clearer.
data_rf_OOB_output <- bind_rows(
rf_OOB_output(data_fit,2, melbclean_classification %>% pull(Type))
)
# Printing the fitted workflow shows the ranger call and the 11.88% OOB error
data_fit
## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: rand_forest()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 1 Recipe Step
##
## • step_rm()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Ranger result
##
## Call:
## ranger::ranger(x = maybe_data_frame(x), y = y, num.trees = ~1000, min.node.size = min_rows(~2, x), probability = ~FALSE, importance = ~"impurity", num.threads = 1, verbose = FALSE, seed = sample.int(10^5, 1))
##
## Type: Classification
## Number of trees: 1000
## Sample size: 6188
## Number of independent variables: 15
## Mtry: 3
## Target node size: 2
## Variable importance mode: impurity
## Splitrule: gini
## OOB prediction error: 11.88 %
# Impurity-based variable importance from the underlying ranger model
model_output <-
data_fit %>%
extract_fit_engine()
model_output %>%
vip(num_features = 10) + theme_classic() #based on impurity
# Most important predictors of house type
model_output %>% vip::vi() %>% head()
## # A tibble: 6 × 2
## Variable Importance
## <chr> <dbl>
## 1 Landsize 586.
## 2 BuildingArea 430.
## 3 YearBuilt 397.
## 4 logPrice 355.
## 5 Rooms 245.
## 6 Bedroom2 218.
# Least important predictors
model_output %>% vip::vi() %>% tail()
## # A tibble: 6 × 2
## Variable Importance
## <chr> <dbl>
## 1 Propertycount 102.
## 2 CouncilArea 87.7
## 3 Car 78.3
## 4 Regionname 51.1
## 5 Bathroom 49.1
## 6 Method 39.0
# Scatter of distance (x) vs. building area (y).
# Fixes: the axis labels were swapped relative to the aes() mapping, and the
# CBD in this data set is Melbourne's, not Sydney's.
ggplot(melbclean, aes(x = Distance, y = BuildingArea)) +
  geom_point(color = 'lightblue') +
  theme_classic() +
  labs(x = 'Distance from Melbourne Central Business District (km)', y = 'Building Area')
# Convert character columns to factors for clustering
melbclean_clustering <- melbclean %>%
mutate(across(where(is.character), as.factor))
# Numeric subset of variables used for k-means
melbclean_clustering_kmeans <- melbclean_clustering %>%
select(Rooms, YearBuilt, Distance, BuildingArea)
# Data-specific helper: run k-means with k centers and report the total
# within-cluster sum of squares (used for the elbow plot).
house_cluster_ss <- function(k){
  clustering <- kmeans(daisy(melbclean_clustering_kmeans), centers = k)
  clustering$tot.withinss
}
# Elbow plot: total within-cluster SS for k = 1..15
# NOTE(review): kmeans() receives a daisy() dissimilarity object, i.e. rows of
# the dissimilarity matrix are treated as features — this is unusual; pam() is
# the standard clustering partner for daisy(). Confirm intent.
tibble(
k = 1:15,
tot_wc_ss = purrr::map_dbl(1:15, house_cluster_ss)
) %>%
ggplot(aes(x = k, y = tot_wc_ss)) +
geom_point() +
labs(x = "Number of clusters", y = 'Total within-cluster sum of squares') +
theme_classic()
The elbow plot does not show a single clear bend, so we test both k = 3 and k = 5 clusters.
# Run k-means for k = centers = 3
set.seed(253)
kclust_k3_scale <- kmeans(daisy(melbclean_clustering_kmeans), centers = 3)
melbclean_clustering <- melbclean_clustering %>%
  mutate(kclust_3_scale = factor(kclust_k3_scale$cluster))
# Visualize the cluster assignments with Distance and Building Area
# (label fix: the CBD in this data set is Melbourne's, not Sydney's)
ggplot(melbclean_clustering, aes(x = Distance, y = BuildingArea, color = kclust_3_scale)) +
  geom_point() +
  labs(x = 'Distance from Melbourne Central Business District (km)', y = 'Building Area') +
  theme_classic()
# Visualize the cluster assignments with Year Built and Building Area
ggplot(melbclean_clustering, aes(x = YearBuilt, y = BuildingArea, color = kclust_3_scale)) +
  geom_point() +
  labs(x = 'Year Built', y = 'Building Area') +
  theme_classic()
# Visualize the cluster assignments with Type and Distance
ggplot(melbclean_clustering, aes(x = Type, y = Distance, color = kclust_3_scale)) +
  geom_boxplot() +
  labs(x = 'Type of Houses', y = 'Distance from Melbourne Central Business District (km)') +
  theme_classic()
# Run k-means for k = centers = 5
set.seed(253)
kclust_k5_scale <- kmeans(daisy(melbclean_clustering_kmeans), centers = 5)
melbclean_clustering <- melbclean_clustering %>%
  mutate(kclust_5_scale = factor(kclust_k5_scale$cluster))
# Visualize the cluster assignments with Distance and Building Area
# (label fix: the CBD in this data set is Melbourne's, not Sydney's)
ggplot(melbclean_clustering, aes(x = Distance, y = BuildingArea, color = kclust_5_scale)) +
  geom_point() +
  labs(x = 'Distance from Melbourne Central Business District (km)', y = 'Building Area') +
  theme_classic()
# Visualize the cluster assignments with Year Built and Building Area
ggplot(melbclean_clustering, aes(x = YearBuilt, y = BuildingArea, color = kclust_5_scale)) +
  geom_point() +
  labs(x = 'Year Built', y = 'Building Area') +
  theme_classic()
# Visualize the cluster assignments with Type and Distance
ggplot(melbclean_clustering, aes(x = Type, y = Distance, color = kclust_5_scale)) +
  geom_boxplot() +
  labs(x = 'Type of Houses', y = 'Distance from Melbourne Central Business District (km)') +
  theme_classic()
set.seed(253)
# Hierarchical clustering on a random subsample of 50 houses; daisy() handles
# the mixed numeric/factor columns
melbclean_clustering_1 <- melbclean %>%
mutate(across(where(is.character), as.factor)) %>% # Change all character variables to factors
select(-Date) # Remove the Date variable
house_sub <- melbclean_clustering_1 %>%
sample_n(50)
# Distance matrix from the daisy dissimilarities; complete-linkage dendrogram
dist_mat_scaled <- dist(daisy(house_sub))
house_cluster <- hclust(dist_mat_scaled, method = "complete")
plot(house_cluster)
# Cut the tree two ways: at height 3 (hclust_area3) and into 4 clusters (hclust4)
melbclust <- house_sub %>%
mutate(
hclust_area3 = factor(cutree(house_cluster, h = 3)),
hclust4 = cutree(house_cluster,k = 4))
# Profile the 4-cluster solution against Distance, Landsize, Rooms, and Type
ggplot(melbclust, aes(x= factor(hclust4), y = Distance))+
geom_boxplot()+
labs(x = "Cluster")+
theme_classic()
ggplot(melbclust, aes(x= factor(hclust4), y = Landsize))+
geom_boxplot()+
labs(x = "Cluster")+
theme_classic()
ggplot(melbclust, aes(x = factor(hclust4), fill = factor(Rooms)))+
geom_bar(position = "fill") +
labs(x = "Cluster") +
theme_classic()
ggplot(melbclust, aes(x = factor(hclust4), fill = factor(Type)))+
geom_bar(position = "fill") +
labs(x = "Cluster") +
theme_classic()